const char *cl_source_bfs_uiuc_spill =
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics: enable\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"\n"
"#define get_queue_index(tid) ((tid%NUM_P_PER_MP))\n"
"#define get_queue_offset(tid) ((tid%NUM_P_PER_MP)*W_Q_SIZE)\n"
"\n"
"//S. Xiao and W. Feng, .Inter-block GPU communication via fast barrier \n"
"//synchronization,.Technical Report TR-09-19, \n"
"//Dept. of Computer Science, Virginia Tech\n"
"// ****************************************************************************\n"
"// Function: __gpu_sync\n"
"//\n"
"// Purpose:\n"
"//   Implements global barrier synchronization across thread blocks. Thread \n"
"//   blocks must be limited to number of multiprocessors available\n"
"//\n"
"// Arguments:\n"
"//   blocks_to_synch: the number of blocks across which to implement the barrier\n"
"//   g_mutex: keeps track of number of blocks that are at barrier\n"
"//\n"
"// Returns:  nothing\n"
"//\n"
"// Programmer: Aditya Sarwade\n"
"// Creation: June 16, 2011\n"
"//\n"
"// Modifications:\n"
"//\n"
"// ****************************************************************************\n"
"void __gpu_sync(int blocks_to_synch , volatile __global unsigned int *g_mutex)\n"
"{\n"
"    //thread ID in a block\n"
"    barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);\n"
"    int tid_in_block= get_local_id(0);\n"
"    \n"
"\n"
"    // only thread 0 is used for synchronization\n"
"    if (tid_in_block == 0) \n"
"    {\n"
"        atom_add(g_mutex, 1);               \n"
"        //only when all blocks add 1 to g_mutex will\n"
"        //g_mutex equal to blocks_to_synch\n"
"        while(g_mutex[0] < blocks_to_synch)\n"
"        {\n"
"        }\n"
"\n"
"    }\n"
"    barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);\n"
"}\n"
"\n"
"\n"
"//An Effective GPU Implementation of Breadth-First Search, Lijuan Luo,\n"
"//Martin Wong,Wen-mei Hwu ,\n"
"//Department of Electrical and Computer Engineering, \n"
"//University of Illinois at Urbana-Champaign\n"
"// ****************************************************************************\n"
"// Function: BFS_kernel_one_block\n"
"//\n"
"// Purpose:\n"
"//   Perform BFS on the given graph when the frontier length is within one\n"
"//   thread block (i.e max number of threads per block)\n"
"//\n"
"// Arguments:\n"
"//   frontier: array that stores the vertices to visit in the current level \n"
"//   frontier_len: length of the given frontier array \n"
"//   visited: mask that tells if a vertex is currently in frontier\n"
"//   cost: array that stores the cost to visit each vertex \n"
"//   edgeArray: array that gives offset of a vertex in edgeArrayAux\n"
"//   edgeArrayAux: array that gives the edge list of a vertex \n"
"//   numVertices: number of vertices in the given graph \n"
"//   numEdges: number of edges in the given graph \n"
"//   frontier_length: length of the new frontier array\n"
"//   max_local_mem: max size of the shared memory queue\n"
"//   b_q: block level queue\n"
"//   b_q2: alterante block level queue\n"
"//\n"
"// Returns:  nothing\n"
"//\n"
"// Programmer: Aditya Sarwade\n"
"// Creation: June 16, 2011\n"
"//\n"
"// Modifications:\n"
"//\n"
"// ****************************************************************************\n"
"__kernel void BFS_kernel_one_block(\n"
"\n"
"    volatile __global unsigned int *frontier,\n"
"    unsigned int frontier_len,\n"
"    volatile __global int *visited,\n"
"    volatile __global unsigned int *cost,\n"
"    __global unsigned int *edgeArray,\n"
"    __global unsigned int *edgeArrayAux,\n"
"    unsigned int numVertices,\n"
"    unsigned int numEdges,\n"
"    volatile __global unsigned int *frontier_length,\n"
"    const unsigned int max_local_mem,\n"
"\n"
"    //the block queues of size MAX_THREADS_PER_BLOCK\n"
"    volatile __local unsigned int *b_q,\n"
"    volatile __local unsigned int *b_q2)\n"
"{\n"
"    volatile __local unsigned int b_offset[1];\n"
"    volatile __local unsigned int b_q_length[1];\n"
"\n"
"    //get the threadId\n"
"    unsigned int tid = get_local_id(0);\n"
"    //copy frontier queue from global queue to local block queue\n"
"    if(tid<frontier_len)\n"
"    {\n"
"        b_q[tid]=frontier[tid];\n"
"    }\n"
"\n"
"    unsigned int f_len=frontier_len;\n"
"    barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);\n"
"    while(1)\n"
"    {\n"
"        //Initialize the block queue size to 0\n"
"        if(tid==0)\n"
"        {\n"
"            b_q_length[0]=0;\n"
"            b_offset[0]=0;\n"
"        }\n"
"        barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);\n"
"        if(tid<f_len)\n"
"        {\n"
"            //get the nodes to traverse from block queue\n"
"            unsigned int node_to_process=b_q[tid];\n"
"            \n"
"            visited[node_to_process]=0;\n"
"            //get the offsets of the vertex in the edge list\n"
"            unsigned int offset = edgeArray[node_to_process];\n"
"            unsigned int next   = edgeArray[node_to_process+1];\n"
"\n"
"            //Iterate through the neighbors of the vertex\n"
"            while(offset<next)\n"
"            {\n"
"                //get neighbor\n"
"                unsigned int nid=edgeArrayAux[offset];\n"
"                //get its cost\n"
"                unsigned int v=atom_min(&cost[nid],cost[node_to_process]+1);\n"
"                //if cost is less than previously set add to frontier\n"
"                if(v>cost[node_to_process]+1)\n"
"                {\n"
"                    int is_in_frontier=atom_xchg(&visited[nid],1);\n"
"                    //if node already in frontier do nothing\n"
"                    if(is_in_frontier==0)\n"
"                    {\n"
"                            //increment the local queue size\n"
"                            unsigned int t=atom_add(&b_q_length[0],1);\n"
"                            if(t< max_local_mem)\n"
"                            {\n"
"                                b_q2[t]=nid;\n"
"                            }\n"
"                            //write to global memory if shared memory full\n"
"                            else\n"
"                            {\n"
"                                int off=atom_add(&b_offset[0],1);\n"
"                                frontier[off]=nid;\n"
"                            }\n"
"                        }\n"
"                }\n"
"                offset++;\n"
"            }\n"
"        }\n"
"        barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);\n"
"        //copy block queue from b_q2 to b_q\n"
"        if(tid<max_local_mem)\n"
"            b_q[tid]=b_q2[tid];\n"
"        barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);\n"
"        //if traversal complete exit\n"
"        if(b_q_length[0]==0)\n"
"        { \n"
"            if(tid==0)\n"
"                frontier_length[0]=0;\n"
"\n"
"            return;\n"
"        }\n"
"        // if frontier exceeds one block in size copy block queue to\n"
"        //global queue and exit\n"
"        else if( b_q_length[0] > get_local_size(0) || \n"
"                 b_q_length[0] > max_local_mem)\n"
"        {\n"
"            if(tid<(b_q_length[0]-b_offset[0]))\n"
"                frontier[b_offset[0]+tid]=b_q[tid];\n"
"            if(tid==0)\n"
"            {\n"
"                frontier_length[0] = b_q_length[0];\n"
"            }\n"
"            return; \n"
"        }\n"
"        f_len=b_q_length[0];\n"
"        barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);\n"
"    }\n"
"}\n"
"\n"
"// ****************************************************************************\n"
"// Function: BFS_kernel_SM_block\n"
"//\n"
"// Purpose:\n"
"//   Perform BFS on the given graph when the frontier length is greater than \n"
"//   one thread block but less than number of Streaming Multiprocessor(SM) \n"
"//   thread blocks (i.e max threads per block * SM blocks)\n"
"//\n"
"// Arguments:\n"
"//   frontier: array that stores the vertices to visit in the current level \n"
"//   frontier_len: length of the given frontier array \n"
"//   frontier2: alternate frontier array\n"
"//   visited: mask that tells if a vertex is currently in frontier\n"
"//   cost: array that stores the cost to visit each vertex \n"
"//   edgeArray: array that gives offset of a vertex in edgeArrayAux\n"
"//   edgeArrayAux: array that gives the edge list of a vertex \n"
"//   numVertices: number of vertices in the given graph \n"
"//   numEdges: number of edges in the given graph \n"
"//   frontier_length: length of the new frontier array\n"
"//   g_mutex: mutex for implementing global barrier \n"
"//   g_mutex2: gives the starting value of the g_mutex used in global barrier \n"
"//   g_q_offsets: gives the offset of a block in the global queue\n"
"//   g_q_size: keeps track of the size of frontier in intermediate iterations\n"
"//   max_local_mem: max size of the shared memory queue\n"
"//   b_q: block level queue\n"
"//\n"
"//\n"
"// Returns:  nothing\n"
"//\n"
"// Programmer: Aditya Sarwade\n"
"// Creation: June 16, 2011\n"
"//\n"
"// Modifications:\n"
"//\n"
"// ****************************************************************************\n"
"__kernel void BFS_kernel_SM_block(\n"
"\n"
"    volatile __global unsigned int *frontier,\n"
"    unsigned int frontier_len,\n"
"    volatile __global unsigned int *frontier2,\n"
"    volatile __global int *visited, \n"
"    volatile __global unsigned int *cost,\n"
"    __global unsigned int *edgeArray, \n"
"    __global unsigned int *edgeArrayAux,\n"
"    unsigned int numVertices, \n"
"    unsigned int numEdges, \n"
"    volatile __global unsigned int *frontier_length,\n"
"    volatile __global unsigned int *g_mutex, \n"
"    volatile __global unsigned int *g_mutex2, \n"
"    volatile __global unsigned int *g_q_offsets, \n"
"    volatile __global unsigned int *g_q_size,\n"
"    const unsigned int max_local_mem,\n"
"\n"
"    //block queue\n"
"    volatile __local unsigned int *b_q)\n"
"{\n"
"\n"
"    volatile __local unsigned int b_q_length[1];\n"
"    volatile __local unsigned int b_offset[1];\n"
"    //get the threadId\n"
"    unsigned int tid=get_global_id(0);\n"
"    unsigned int lid=get_local_id(0);\n"
"\n"
"    int loop_index=0;\n"
"    unsigned int l_mutex=g_mutex2[0];\n"
"    unsigned int f_len=frontier_len;\n"
"    barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);\n"
"    while(1)\n"
"    {\n"
"        //Initialize the block queue size to 0\n"
"        if (lid==0)\n"
"        {\n"
"            b_q_length[0]=0;\n"
"            b_offset[0]=0;\n"
"        }\n"
"        barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);\n"
"        if(tid<f_len)\n"
"        {\n"
"            unsigned int node_to_process;  \n"
"            \n"
"            //get the node to traverse from block queue\n"
"            if(loop_index==0)\n"
"               node_to_process=frontier[tid];\n"
"            else\n"
"               node_to_process=frontier2[tid]; \n"
"\n"
"            //node removed from frontier\n"
"            visited[node_to_process]=0;\n"
"            //get the offsets of the vertex in the edge list\n"
"            unsigned int offset=edgeArray[node_to_process];\n"
"            unsigned int next=edgeArray[node_to_process+1];\n"
"\n"
"            //Iterate through the neighbors of the vertex\n"
"            while(offset<next)\n"
"            {\n"
"                //get neighbor\n"
"                unsigned int nid=edgeArrayAux[offset];\n"
"                //get its cost\n"
"                unsigned int v=atom_min(&cost[nid],cost[node_to_process]+1);\n"
"                //if cost is less than previously set add to frontier\n"
"                if(v>cost[node_to_process]+1)\n"
"                {\n"
"                    int is_in_frontier=atom_xchg(&visited[nid],1);\n"
"                    //if node already in frontier do nothing\n"
"                    if(is_in_frontier==0)\n"
"                    {\n"
"                        //increment the warp queue size\n"
"                        unsigned int t=atom_add(&b_q_length[0],1);\n"
"                        if(t<max_local_mem)\n"
"                        {\n"
"                            b_q[t]=nid;\n"
"                        }\n"
"                        //write to global memory if shared memory full\n"
"                        else\n"
"                        {\n"
"                            int off=atom_add(g_q_offsets,1);\n"
"                            if(loop_index==0)\n"
"                                frontier2[off]=nid;\n"
"                            else\n"
"                                frontier[off]=nid;\n"
"                        } \n"
"                    }\n"
"                }\n"
"                offset++;\n"
"            }\n"
"        }\n"
"\n"
"        barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);\n"
"        //get block queue offset in global queue\n"
"        if(lid==0)\n"
"        {\n"
"            if(b_q_length[0] > max_local_mem)\n"
"            {\n"
"                b_q_length[0] = max_local_mem;\n"
"            }\n"
"            b_offset[0]=atom_add(g_q_offsets,b_q_length[0]);\n"
"        }\n"
"\n"
"        //global barrier\n"
"        barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);\n"
"		l_mutex+=get_num_groups(0);\n"
"		__gpu_sync(l_mutex,g_mutex);\n"
"\n"
"        //store frontier size\n"
"        if(tid==0)\n"
"        {\n"
"            g_q_size[0]=g_q_offsets[0];\n"
"            g_q_offsets[0]=0;\n"
"        }\n"
"\n"
"        //copy block queue to global queue\n"
"        if(lid < b_q_length[0])\n"
"        {\n"
"            if(loop_index==0)\n"
"                frontier2[lid+b_offset[0]]=b_q[lid];\n"
"            else\n"
"                frontier[lid+b_offset[0]]=b_q[lid];\n"
"        }\n"
"        \n"
"        //global barrier\n"
"		l_mutex+=get_num_groups(0);\n"
"		__gpu_sync(l_mutex,g_mutex);\n"
"\n"
"        //exit if frontier size exceeds SM blocks or is less than 1 block\n"
"        if(g_q_size[0] < get_local_size(0) ||\n"
"            g_q_size[0] > get_local_size(0) * get_num_groups(0))\n"
"                break;                                                  \n"
"\n"
"        loop_index=(loop_index+1)%2;\n"
"        //store the current frontier size\n"
"        f_len=g_q_size[0];\n"
"    }\n"
"\n"
"    if(loop_index==0)\n"
"    {\n"
"        for(int i=tid;i<g_q_size[0];i += get_global_size(0))\n"
"               frontier[i]=frontier2[i];\n"
"    }\n"
"    if(tid==0)\n"
"    {\n"
"        frontier_length[0]=g_q_size[0];\n"
"    }\n"
"}\n"
"\n"
"// ****************************************************************************\n"
"// Function: BFS_kernel_multi_block\n"
"//\n"
"// Purpose:\n"
"//   Perform BFS on the given graph when the frontier length is greater than \n"
"//   than number of Streaming Multiprocessor(SM) thread blocks \n"
"//   (i.e max threads per block * SM blocks)\n"
"//\n"
"// Arguments:\n"
"//   frontier: array that stores the vertices to visit in the next level \n"
"//   frontier_len: length of the given frontier array \n"
"//   frontier2: used with frontier in even odd loops\n"
"//   visited: mask that tells if a vertex is currently in frontier\n"
"//   cost: array that stores the cost to visit each vertex \n"
"//   edgeArray: array that gives offset of a vertex in edgeArrayAux\n"
"//   edgeArrayAux: array that gives the edge list of a vertex \n"
"//   numVertices: number of vertices in the given graph \n"
"//   numEdges: number of edges in the given graph \n"
"//   frontier_length: length of the new frontier array\n"
"//   max_local_mem: max size of the shared memory queue\n"
"//   b_q: block level queue\n"
"//\n"
"//\n"
"// Returns:  nothing\n"
"//\n"
"// Programmer: Aditya Sarwade\n"
"// Creation: June 16, 2011\n"
"//\n"
"// Modifications:\n"
"//\n"
"// ****************************************************************************\n"
"__kernel void BFS_kernel_multi_block(\n"
"\n"
"    volatile __global unsigned int *frontier,\n"
"    unsigned int frontier_len,\n"
"    volatile __global unsigned int *frontier2,\n"
"    volatile __global int *visited,\n"
"    volatile __global unsigned int *cost,\n"
"    __global unsigned int *edgeArray,\n"
"    __global unsigned int *edgeArrayAux,\n"
"    unsigned int numVertices,\n"
"    unsigned int numEdges,\n"
"    volatile __global unsigned int *frontier_length,\n"
"    const unsigned int max_local_mem,\n"
"\n"
"    volatile __local unsigned int *b_q)\n"
"{\n"
"    volatile __local unsigned int b_q_length[1];\n"
"    volatile __local unsigned int b_offset[1];\n"
"\n"
"    //get the threadId\n"
"    unsigned int tid=get_global_id(0);\n"
"    unsigned int lid=get_local_id(0);\n"
"\n"
"    //initialize the block queue length\n"
"    if (lid == 0)\n"
"    {\n"
"        b_q_length[0]=0;\n"
"        b_offset[0]=0;\n"
"    }\n"
"\n"
"    barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);\n"
"    if(tid<frontier_len)\n"
"    {\n"
"        //get the nodes to traverse from block queue\n"
"        unsigned int node_to_process=frontier[tid];  \n"
"        visited[node_to_process]=0;\n"
"        //get the offsets of the vertex in the edge list\n"
"        unsigned int offset=edgeArray[node_to_process];\n"
"        unsigned int next=edgeArray[node_to_process+1];\n"
"\n"
"        //Iterate through the neighbors of the vertex\n"
"        while(offset<next)\n"
"        {\n"
"            //get neighbor\n"
"            unsigned int nid=edgeArrayAux[offset];\n"
"            //get its cost\n"
"            unsigned int v=atom_min(&cost[nid],cost[node_to_process]+1);\n"
"            //if cost is less than previously set add to frontier\n"
"            if(v>cost[node_to_process]+1)\n"
"            {\n"
"                int is_in_frontier=atom_xchg(&visited[nid],1);\n"
"                //if node already in frontier do nothing\n"
"                if(is_in_frontier==0)\n"
"                {\n"
"                        //increment the warp queue size\n"
"                        unsigned int t=atom_add(&b_q_length[0],1);\n"
"                        if(t<max_local_mem)\n"
"                        {\n"
"                            b_q[t]=nid;\n"
"                        }\n"
"                        //write to global memory if shared memory full\n"
"                        else\n"
"                        {\n"
"                            int off=atom_add(frontier_length,1);\n"
"                            frontier2[off]=nid;\n"
"                        } \n"
"                }\n"
"            }\n"
"            offset++;\n"
"        }\n"
"    }\n"
"    barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);\n"
"\n"
"    //get block queue offset in global queue\n"
"    if(lid==0)\n"
"    {\n"
"        if(b_q_length[0] > max_local_mem)\n"
"        {\n"
"                b_q_length[0]=max_local_mem;\n"
"        }\n"
"        b_offset[0]=atom_add(frontier_length,b_q_length[0]);\n"
"    }\n"
"\n"
"    barrier(CLK_LOCAL_MEM_FENCE|CLK_GLOBAL_MEM_FENCE);\n"
"    //copy block queue to global queue\n"
"    if(lid < b_q_length[0])\n"
"        frontier2[lid+b_offset[0]]=b_q[lid];\n"
"\n"
"}\n"
"\n"
"// ****************************************************************************\n"
"// Function: Reset_kernel_parameters\n"
"//\n"
"// Purpose:\n"
"//   Reset the global variables\n"
"//\n"
"// Arguments:\n"
"//   frontier_length: length of the new frontier array\n"
"//   g_mutex: mutex for implementing global barrier \n"
"//   g_mutex2: gives the starting value of the g_mutex used in global barrier \n"
"//   g_q_offsets: gives the offset of a block in the global queue\n"
"//   g_q_size: size of the global queue \n"
"//\n"
"// Returns:  nothing\n"
"//\n"
"// Programmer: Aditya Sarwade\n"
"// Creation: June 16, 2011\n"
"//\n"
"// Modifications:\n"
"//\n"
"// ****************************************************************************\n"
"__kernel void Reset_kernel_parameters(\n"
"\n"
"    __global unsigned int *frontier_length, \n"
"    __global volatile int *g_mutex, \n"
"    __global volatile int *g_mutex2, \n"
"    __global volatile int *g_q_offsets, \n"
"    __global volatile int *g_q_size)\n"
"{\n"
"    g_mutex[0]=0;\n"
"    g_mutex2[0]=0;\n"
"    *frontier_length=0;\n"
"    *g_q_offsets=0;\n"
"    g_q_size[0]=0;\n"
"}\n"
"\n"
"// ****************************************************************************\n"
"// Function: Frontier_copy\n"
"//\n"
"// Purpose:\n"
"//   Copy frontier2 data to frontier\n"
"//\n"
"// Arguments:\n"
"//   frontier: array that stores the vertices to visit in the current level \n"
"//   frontier2: alternate frontier array\n"
"//   frontier_length: length of the frontier array\n"
"//   g_mutex: mutex for implementing global barrier \n"
"//   g_mutex2: gives the starting value of the g_mutex used in global barrier \n"
"//   g_q_offsets: gives the offset of a block in the global queue\n"
"//   g_q_size: size of the global queue \n"
"//\n"
"// Returns:  nothing\n"
"//\n"
"// Programmer: Aditya Sarwade\n"
"// Creation: June 16, 2011\n"
"//\n"
"// Modifications:\n"
"//\n"
"// ****************************************************************************\n"
"__kernel void Frontier_copy(\n"
"    __global unsigned int *frontier, \n"
"    __global unsigned int *frontier2, \n"
"    __global unsigned int *frontier_length,  \n"
"    __global volatile int *g_mutex, \n"
"    __global volatile int *g_mutex2, \n"
"    __global volatile int *g_q_offsets, \n"
"    __global volatile int *g_q_size)\n"
"{\n"
"    unsigned int tid=get_global_id(0);\n"
"\n"
"    if(tid<*frontier_length)\n"
"    {\n"
"        frontier[tid]=frontier2[tid];\n"
"    }\n"
"}\n"
;
